#################################################################
# Remove all (non-hidden) objects from the current environment so the script
# starts without leftovers from earlier work.
# NOTE(review): this neither detaches packages nor resets options, so it is not
# equivalent to running in a fresh R session.
rm(list=ls())
#################################################################
# Dependencies
#################################################################
# global
library(dplyr)
library(stringi)
library(stringr)
library(pbapply)
library(pbmcapply)
library(magrittr)
library(stringr)
library(rvest)

# Make the folder of this script the working directory so that the relative
# '../input' and '../output' paths used below resolve.
# rstudioapi can only report the active document from inside RStudio; when the
# script is run elsewhere (e.g. Rscript) the call would abort, so in that case
# the current working directory is kept and a message is emitted instead.
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
} else {
  message(
    "RStudio is not available; keeping working directory: ", getwd(),
    " (relative paths below are resolved against it)"
  )
}
library(here)
# Prints the project root detected by `here`.
here::here()

# load corpus
# load() restores the objects stored in the file into the workspace; the code
# below relies on it creating an object called `corpus`.
# NOTE(review): the file carries an .RDS extension but is read with load(),
# which expects the save() format rather than saveRDS() - confirm how it was written.
load("../output/01-minified-corpus-NER_2021-06-09.RDS")
str(corpus)

# Turn the ';'-separated id string of every row into a character vector, so
# that `person.id` becomes a list column with one vector of ids per row.
corpus <- corpus %>%
  mutate(person.id = strsplit(person.id, ";"))
head(corpus)

# load candidate data
df <- as_tibble(
  read.csv("../output/00-Named_Entity_List_withID.csv", stringsAsFactors = FALSE)
)
str(df)

# Harmonise two party labels: "GPS" is relabelled "Grüne" and "GLP" becomes
# "glp". mutate() evaluates its arguments in order, so the second rule sees
# the result of the first.
df <- mutate(
  df,
  party = ifelse(party == "GPS", "Grüne", party),
  party = ifelse(party == "GLP", "glp", party)
)

# add the party regex term for the different languages
# `plist$abbrv` is used as a named collection: each element holds the patterns
# of one party and the element names are the labels later joined to `df$party`.
plist <- readRDS('../output/00-partyList.RDS') # load list with party regex

# Collapse the patterns of every party into a single alternation ("a|b|c").
# The iteration runs over `plist$abbrv` itself instead of 1:length(plist[[1]]),
# so regexes and names always stem from the same element and cannot get out of
# step; vapply() enforces exactly one string per party.
# An earlier variant that wrapped each pattern in word boundaries is kept for
# reference:
#   paste0('\\b', paste0(plist$abbrv[[x]], collapse='\\b|\\b'), '\\b')
plist <- tibble(
  party.regex = vapply(
    plist$abbrv, paste0, character(1),
    collapse = "|", USE.NAMES = FALSE
  ),
  bindr = names(plist$abbrv)
)

# Keep only candidates with a known party and attach the regex built for that
# party (`bindr` in `plist` holds the party label).
df <- df %>%
  filter(!is.na(party)) %>%
  left_join(plist, by = c("party" = "bindr"))


# Merge the manually curated annotations from the ';'-separated text file.
# No `by` is given, so dplyr joins on every column name the two tables share
# and reports the chosen keys in a message.
# NOTE(review): which columns `annot` contributes is not visible here - confirm
# the implied join keys are the intended ones.
annot <- read.table(
  "../input/curated_party_regex.txt",
  header = TRUE,
  sep = ";",
  stringsAsFactors = FALSE
)
df <- df %>% left_join(annot)


unique(df$party)

# Extend the pattern of the party labelled "Grüne" with the spelled-out names
# (Grüne / Verts / I Verdi, each with upper- or lower-case initial).
# If no pattern was joined for that party, `party.regex` is NA and
# paste0(NA, "|...") would produce the string "NA|(G|g)rüne|...", which matches
# every text containing the letters "NA". In that case only the extension is
# used.
gruene_regex <- "(G|g)rüne|(v|V)erts|(I|i) (V|v)erdi"
df %<>% mutate(
  party.regex = ifelse(
    party == "Grüne",
    ifelse(
      is.na(party.regex),
      gruene_regex,
      paste0(party.regex, "|", gruene_regex)
    ),
    party.regex
  )
)



unique(df$party.regex)

# check corpus for party regex
# For every corpus row, keep only those person ids whose party regex is found
# in the text of that row (`corpus$txt`). The result `log` is a list with one
# element per row: a character vector of the retained ids (possibly empty), or
# NULL when the row has no ids at all.
#
# The patterns are looked up per id. Subsetting `df$party.regex` with
# `df$id %in% ids` and naming the result with `ids` is not safe: the subset
# comes back in the row order of `df` and may be shorter or longer than `ids`
# (ids missing from `df`, or ids occurring on several rows), so regexes would
# be attributed to the wrong person.
# NOTE: `log` shadows the name of base::log as a variable; calls such as log(x)
# still reach the function.

# All patterns registered for each id, addressable by the id as a string.
party_regex_by_id <- split(df$party.regex, df$id)

# pblapply() (pbapply is attached at the top of the script) behaves like
# lapply() and shows a progress bar in place of printing every row index.
log <- pblapply(seq_len(nrow(corpus)), function(x) {
  ids <- corpus$person.id[[x]]
  ids <- ids[!is.na(ids)]
  if (length(ids) == 0L) {
    return(NULL)
  }

  txt <- corpus$txt[x]
  keep <- vapply(ids, function(id) {
    # `[` with an unknown name yields a NULL element, so ids that are absent
    # from `df` simply end up without patterns.
    patterns <- unlist(party_regex_by_id[id], use.names = FALSE)
    patterns <- patterns[!is.na(patterns)]
    # any() over zero patterns is FALSE: ids without a usable regex are dropped.
    any(vapply(
      patterns,
      function(pattern) grepl(pattern, txt),
      logical(1),
      USE.NAMES = FALSE
    ))
  }, logical(1), USE.NAMES = FALSE)

  ids[keep]
})

# Replace the id lists with the regex-confirmed ids computed above.
corpus$person.id <- log

# Drop rows for which no id was confirmed, inspect what is left and store the
# curated corpus.
corpus <- corpus %>%
  filter(lengths(person.id) > 0)
nrow(corpus)
head(corpus)
saveRDS(corpus, "../output/02-minified-corpus_2021-06-09-CURATED.RDS")
